# Imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
# Variables
# Path of the CSV dataset that is loaded with pd.read_csv below
filename = 'exam_2020_01_07_data.csv'
# Field delimiter handed to pd.read_csv
separator = ','
# Name of the column that holds the class labels
target = 'Class'
# Seed reused for NumPy and for every random_state argument below, so runs are repeatable
random_state = 42
# Directives
# IPython magic (only valid inside Jupyter/IPython): draw matplotlib figures inline
%matplotlib inline
# Seed NumPy's global random number generator
np.random.seed(random_state)
# Read the CSV into a DataFrame, then preview the first rows as a sanity check
df = pd.read_csv(filename, sep=separator)
df.head()
| | C00 | C01 | C02 | C03 | C04 | Class |
|---|---|---|---|---|---|---|
| 0 | 1.855416 | 0.466367 | -0.176765 | 1.546514 | 0.149219 | c |
| 1 | -0.107873 | -0.136792 | 1.551591 | -0.813810 | 1.357674 | a |
| 2 | 2.712560 | -0.495846 | 1.397077 | 1.483562 | 1.656526 | b |
| 3 | -2.166084 | -0.582271 | 0.353011 | -1.864210 | -2.267033 | b |
| 4 | 2.848831 | -0.507369 | 1.661752 | 1.466627 | 1.938519 | b |
# Pairwise plots of all features, coloured by the class label, to reveal any visible structure
sns.pairplot(data=df, hue=target)
<seaborn.axisgrid.PairGrid at 0x2140fea7070>
The pairplots don't seem to show any particular pattern in the data.
For this classification assignment, we will use a Decision Tree.
We start by dividing the features matrix and the target column, as follows:
# Separate the class labels from the predictors
y = df[target]
X = df.drop(columns=target)
We now split the data into training and test sets by means of the train_test_split function
# Keep two thirds of the rows for training; the remaining third is held out for testing
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    train_size=2/3,
    random_state=random_state,
)
print(f"We have {Xtrain.shape[0]} items in our training set")
print(f"We have {Xtest.shape[0]} items in our test set")
We have 666 items in our training set We have 334 items in our test set
We now instantiate a Decision Tree and fit it on the training data.
We then use it to predict the training values, using the accuracy_score function to see the accuracy on the training set
# Build an entropy-based decision tree with no depth limit and fit it on the training data
estimator = tree.DecisionTreeClassifier(criterion="entropy", random_state=random_state)
estimator.fit(Xtrain, ytrain)
# Predict the training set itself and report the accuracy as a percentage
train_set_prediction = estimator.predict(Xtrain)
train_set_accuracy = 100 * accuracy_score(y_true=ytrain, y_pred=train_set_prediction)
print(f"The accuracy on the training set was {train_set_accuracy:.2f}%")
The accuracy on the training set was 100.00%
To have a more meaningful result, we will try it on the test set as well, to obtain a "baseline" value for the performance of our classifier
# Baseline: accuracy (in percent) of the unconstrained tree on the held-out test set
test_set_prediction = estimator.predict(Xtest)
test_set_accuracy = 100 * accuracy_score(y_true=ytest, y_pred=test_set_prediction)
print(f"The accuracy on the test set was {test_set_accuracy:.2f}%")
The accuracy on the test set was 84.13%
Since our assignment is to use the train-validation-test schema, we perform one more split to obtain a separate validation set for hyperparameter tuning
# The validation set is carved out of the TRAINING data, not the test data.
# Tuning max_depth on rows taken from Xtest would leak test information into
# the hyperparameter choice and make the final test accuracy optimistic, so
# Xtest/ytest are left untouched until the final evaluation.
# With no explicit size, train_test_split holds out 25% of the rows it is given.
Xtrain_t, Xval, ytrain_t, yval = train_test_split(Xtrain, ytrain, random_state=random_state)
print(f"We have {Xtrain_t.shape[0]} items in our reduced training set")
print(f"We have {Xtest.shape[0]} items in our test set")
print(f"We have {Xval.shape[0]} items in our validation set")
We have 334 items in our test set We have 84 items in our validation set
We now record the depth reached by the tree fitted with default hyperparameters. Using it as an upper bound, we try every max_depth from 1 up to that value in order to see which depth fits our data best.
# Depth reached by the previously fitted, unconstrained tree; it bounds the search range
default_tree_depth = estimator.tree_.max_depth
depths = range(1, default_tree_depth + 1)
# scores[i] is the validation accuracy (in percent) obtained with max_depth == depths[i]
scores = []
for depth in depths:
    # Create a Decision Tree with limited depth
    estimator = tree.DecisionTreeClassifier(criterion="entropy", max_depth=depth, random_state=random_state)
    # Fit it on the tuning split (Xtrain_t, ytrain_t) produced by the preceding train_test_split
    estimator.fit(Xtrain_t, ytrain_t)
    # Predict the validation set and save the accuracy of this classifier
    prediction = estimator.predict(Xval)
    score = accuracy_score(yval, prediction) * 100
    scores.append(score)
We now have a look at the accuracy scores that we obtained in the previous step
# Validation accuracy as a function of max_depth, drawn through the Axes interface
fig, ax = plt.subplots(figsize=(32, 20))
ax.plot(depths, scores, '-o', linewidth=5, markersize=24)
ax.set_xlabel('Max depth')
ax.set_ylabel('Accuracy')
ax.set_title("Score with validation varying the max_depth of the tree", fontsize=24)
plt.show()
The best hyperparameter configuration is the one that maximises the accuracy
# Position of the highest validation accuracy (np.argmax returns the first one on ties)
best_index = int(np.argmax(scores))
best_depth = depths[best_index]
# Read the score through the same index rather than `best_depth - 1`, so the
# lookup stays correct even if `depths` no longer starts at 1
print(f"The best depth parameter was {best_depth}, with an accuracy of {scores[best_index]:.2f}%")
The best depth parameter was 8, with an accuracy of 78.57%
We now refit a tree with the tuned max_depth on the full training set and compute its accuracy on the test set
# Refit on the full training set with the selected depth, then score on the test set
estimator = tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=best_depth,
    random_state=random_state,
).fit(Xtrain, ytrain)
prediction = estimator.predict(Xtest)
print(f"The accuracy of the tuned Decision Tree is {accuracy_score(ytest, prediction) * 100:.2f}%")
The accuracy of the tuned Decision Tree is 84.13%
The results show that we obtained the same accuracy but with a smaller tree
# Compare the depth reached by the unconstrained tree with the max_depth selected by tuning
print(f"The depth of the original tree was {default_tree_depth} and it is now {best_depth}")
The depth of the original tree was 15 and it is now 8
Let us choose C01 and C02 as the attribute pair we will consider
attributes = ['C01', 'C02']
# Attach the predicted labels to the test features so that seaborn
# can colour each test point by its predicted class
df_test = Xtest.assign(y_predicted=prediction)
x_attr, y_attr = attributes
sns.scatterplot(data=df_test, x=x_attr, y=y_attr, hue='y_predicted')
<AxesSubplot:xlabel='C01', ylabel='C02'>
We notice that the items of class c tend to be in the lower part of the plot
# Marker style shows whether each prediction matches the true label (True = correct)
is_correct = ytest == df_test['y_predicted']
sns.scatterplot(data=df_test, x=attributes[0], y=attributes[1], hue='y_predicted', style=is_correct)
<AxesSubplot:xlabel='C01', ylabel='C02'>